## script for cleaning Indonesia sample from Facebook sampling survey ##
# preamble #### 
rm(list=ls())

library(foreign)
library(survey)
library(xtable)
library(readxl)
library(haven)
library(magrittr)
library(tidyverse)


# Every relative path used below is resolved against this replication folder.
setwd("~/Dropbox/facebook sampling/replication/indonesia replication")

# load and clean facebook data ####
# Read the de-identified Facebook survey export (file name covers waves 1-2).
d <- read_csv("indonesia data/indonesia_facebook_wave1-2_deidentified.csv")

# Keep the column names, plus the first data row transposed into a one-column
# matrix, for quick reference while cleaning.
vars <- names(d)
varinfo <- t(as.matrix(d[1, ]))

## recode and clean up demographics
# 1. Give the four political-action items readable names.
# 2. Collapse each item to Yes / No / DK: answers containing the "have done"
#    wording become "Yes", the "have not done" wording "No", everything else
#    (including missing) "DK". Petition is matched on a slightly longer phrase
#    than the other three items.
# 3. degreerecode: three-level attainment (Postsecondary / nosecondary /
#    secondary) built from demo_degree, with "No formal education" in demo_educ
#    also counted as nosecondary; unmatched rows keep their demo_degree value.
# 4. dem_edu: short labels for the four demo_educ categories (others -> NA).
# 5. dem_rel: two write-in style categories are pooled into "Other".
d <- d %>%
  dplyr::rename(
    Petition = pol_actions1,
    OnlinePost = pol_actions2,
    Meeting = pol_actions3,
    Protested = pol_actions4
  ) %>%
  dplyr::mutate(
    Petition = case_when(
      grepl("I have done this", Petition) ~ "Yes",
      grepl("I have not done this", Petition) ~ "No",
      .default = "DK"
    ),
    dplyr::across(
      c(OnlinePost, Meeting, Protested),
      ~ case_when(
        grepl("I have done", .x) ~ "Yes",
        grepl("I have not done", .x) ~ "No",
        .default = "DK"
      )
    ),
    degreerecode = case_when(
      # tested first, so a postsecondary degree wins over demo_educ
      demo_degree %in% c(
        "S1/Bachelor", "S2/Master", "S3/PhD", "D4 (4 years Diploma)",
        "Profession", "D1/D2 (1 Year Diploma/2 Years Diploma)",
        "D3 (3 Years Diploma)"
      ) ~ "Postsecondary",
      demo_educ == "No formal education" |
        demo_degree %in% c(
          "Don't have an elementary school diploma",
          "Package A (Elementary School Equivalency)",
          "SDLB (ES for Student with Disability)",
          "MI (Madrasah Ibtidaiyah/Faith Based ES)",
          "SD (ES)",
          "Package B  (Middle School Equivalency)",
          "Middle School (Junior High)",
          "LB Middle School (Middle School for Student with Disabilities)",
          "MTs (Madrasah Tsanawiyah/Faith based Middle School)"
        ) ~ "nosecondary",
      demo_degree %in% c(
        "High School",
        "MA (Madrasah Aliyah/Faithbased Highschool)",
        "Package C ( High School Equivalency)",
        "SMK (TVET HIghschool)",
        "MAK (TVET MA)",
        "SMLB (High School for Student with Disabilities)"
      ) ~ "secondary",
      .default = demo_degree
    ),
    dem_edu = case_when(
      demo_educ == "No formal education" ~ "None",
      demo_educ == "Primary education" ~ "Primary",
      demo_educ == "Secondary education (high school)" ~ "Secondary",
      demo_educ == "Tertiary education (college or university degree)" ~ "University"
    ),
    dem_rel = ifelse(
      dem_rel %in% c("Others (write down)", "Penghayat Kepercayaan"),
      "Other",
      dem_rel
    )
  )

## check values
# Interactive sanity checks on the raw demographic and ad-targeting fields.
table(d$demo_age)
table(d$demo_gender)
table(d$gender)
sort(unique(d$region))
sort(unique(d$geo_regcity_1))

# One-off step (kept for the record, not run): export the provinces that
# respondents selected as their residence so they can be matched by hand (in
# excel) to the facebook ad-targeted region for each selection.
# regions<-d%>%select(geo_regcity_1)%>%arrange(geo_regcity_1)%>%distinct()
# write_csv(regions,file="data_cleaned/region_key_indonesia.csv")

# Load the hand-matched key and attach it, so that the self-selected residence
# can be compared with the fb-targeted one.
regions <- read_csv("indonesia data/region_key_indonesia_wfbrecodes.csv")
d <- left_join(d, regions, by = "geo_regcity_1")

## rename columns in d to avoid confusion
# fb_*   : values coming from the Facebook ad set
# demo_* : values reported by the respondent; demo_region_recode is the
#          self-selected province expressed in the ad-set region codes
d <- dplyr::rename(
  d,
  demo_region = geo_regcity_1,
  demo_region_recode = geo_regcity1_fbcode,
  fb_region = region,
  fb_gender = gender,
  fb_age = age
)

# Harmonise age and gender and derive the age-group variables.
#
# All expressions run top to bottom inside a single mutate():
#   * fb_age / demo_age are coerced to numeric; a missing self-reported age is
#     imputed from the Facebook-assigned age.
#   * demo_gender_orig preserves the self-reported gender before reassignment.
#   * Respondents who chose "Non-binary/other" get their Facebook-assigned
#     gender when it is available, otherwise a random Female/Male draw, so
#     that they can be included in the analysis. Any other value that is not
#     "Male"/"Female" becomes NA (case_when has no default here).
#   * fb_agegroup labels every fb_age below 30 as "21-29".
#   * demo_agegroup is first built with a 21-29 bottom bin and saved as
#     demo_agegroup_orig (ages 18-20 are NA there, for comparison with the
#     facebook cells); the bottom bin is then relabelled "18-29" and ages
#     18-20 are added to it.
#   * Both gender variables are finally title-cased.
#
# NOTE(review): no RNG seed is set, so the random gender assignment differs
# between runs; consider calling set.seed() before this step.
d %<>% dplyr::mutate(
  fb_age = as.numeric(fb_age),
  demo_age = as.numeric(demo_age),
  ## impute missing sample demographics from facebook-assigned ones
  demo_age = ifelse(is.na(demo_age), fb_age, demo_age),
  demo_gender_orig = demo_gender,
  demo_gender = case_when(
    demo_gender == "Non-binary/other" & !is.na(fb_gender) ~ fb_gender,
    # Draw one value per row: a length-one draw would be recycled by
    # case_when and hand every such respondent the same gender.
    demo_gender == "Non-binary/other" & is.na(fb_gender) ~
      sample(c("Female", "Male"), dplyr::n(), replace = TRUE),
    demo_gender == "Male" | demo_gender == "Female" ~ demo_gender
  ),
  fb_agegroup = case_when(
    fb_age < 30 ~ "21-29",
    fb_age >= 30 & fb_age < 50 ~ "30-49",
    fb_age >= 50 & fb_age < 60 ~ "50-59",
    fb_age >= 60 ~ "60+"
  ),
  demo_agegroup = case_when(
    demo_age > 20 & demo_age < 30 ~ "21-29",
    demo_age >= 30 & demo_age < 50 ~ "30-49",
    demo_age >= 50 & demo_age < 60 ~ "50-59",
    demo_age >= 60 ~ "60+"
  ),
  ## keep the version without 18-20 year olds for comparison with facebook cells
  demo_agegroup_orig = demo_agegroup,
  ## new agegroup variable whose youngest bin also covers ages 18-20
  demo_agegroup = ifelse(demo_agegroup == "21-29", "18-29", demo_agegroup),
  demo_agegroup = ifelse(demo_age >= 18 & demo_age <= 20, "18-29", demo_agegroup),
  fb_gender = str_to_title(fb_gender),
  demo_gender = str_to_title(demo_gender)
)



# Recode the political and global-warming items in one pipeline:
#   leftright       Left / Right from the wording of pol_leftright; "Center"
#                   and the don't-know option are both coded "Centrist"
#   party           pol_party_asianbaro with spaces and punctuation stripped
#   gw_important    spaces stripped
#   gw_human        Human / Natural / Nothappening
#   presidentparty  "pdip" when party is "PDIP", otherwise "otherparty"
#   frequentvoter   "frequent" for voting in every or most elections
#   gw_happen       any answer containing "know" becomes "DK"
# The gw_* and turnout columns are then renamed, and votelastelection is
# collapsed to Yes / No / DK, with "too young" respondents set to NA.
d <- d %>%
  dplyr::mutate(
    leftright = case_when(
      grepl("left", pol_leftright, ignore.case = TRUE) ~ "Left",
      grepl("right", pol_leftright, ignore.case = TRUE) ~ "Right",
      pol_leftright %in% c("Center", "Don't know/don't have political views") ~ "Centrist"
    ),
    party = gsub("[[:punct:]]", "", gsub(" ", "", as.character(pol_party_asianbaro))),
    gw_important = gsub(" ", "", gw_important),
    gw_human = case_when(
      gw_human == "Caused mostly by human activities" ~ "Human",
      gw_human == "Caused mostly by natural changes in the environment" ~ "Natural",
      grepl("None of the above", gw_human) ~ "Nothappening"
    ),
    presidentparty = ifelse(party == "PDIP", "pdip", "otherparty"),
    frequentvoter = ifelse(
      asiabarometer_voting %in% c("Voted in every election", "Voted in most elections"),
      "frequent",
      "infrequent"
    ),
    gw_happen = ifelse(grepl("know", gw_happen), "DK", gw_happen)
  ) %>%
  dplyr::rename(
    gwhappening = gw_happen,
    gwhuman = gw_human,
    gwimportant = gw_important,
    voteintention = turnout_intended,
    votelastelection = turnout_lastelec
  ) %>%
  dplyr::mutate(
    ## note that this will be percent of the voting age population who voted
    votelastelection = case_when(
      grepl("remember whether", votelastelection) ~ "DK",
      grepl("did not", votelastelection) ~ "No",
      grepl("voted in the", votelastelection) ~ "Yes",
      grepl("too young", votelastelection) ~ NA
    )
  )

## check values
table(d$gwimportant)
table(d$gwhuman)
table(d$gwhappening)
table(d$presidentparty)


# Survey duration in minutes, then a small table describing the responses that
# are flagged as not complete or that took under five minutes.
d <- d %>% dplyr::mutate(duration.minutes = `Duration (in seconds)` / 60)
incompletes <- d %>%
  dplyr::filter(duration.minutes < 5 | complete == 0) %>%
  dplyr::select(duration.minutes, wave, fb_agegroup, fb_gender, fb_region)


# missing strata ####
# Compare the full region x gender x age-group grid with the cells that are
# actually observed in each wave (using the respondent-reported demo_* fields)
# and list the cells with no respondents.
wave1 <- filter(d, wave == "wave1")
wave2 <- filter(d, wave == "wave2")

# complete list of strata
regionlist <- unique(regions$geo_regcity1_fbcode)
genderlist <- c("Male", "Female")
age_groupslist <- c("18-29", "30-49", "50-59", "60+")

# Every combination, minus rows with a missing region code; the cell id is
# "<region>_<gender>_<agegroup>".
fb_total_strata <- expand.grid(Region = regionlist, Gender = genderlist, AgeGroup = age_groupslist)
fb_total_strata <- filter(fb_total_strata, !is.na(Region))
fb_total_strata$strata <- apply(fb_total_strata, 1, paste, collapse = "_")

# Observed cell counts per wave, with the same cell id attached.
w1_strata <- dplyr::summarise(
  group_by(wave1, demo_agegroup, demo_region_recode, demo_gender),
  n = n()
)
w2_strata <- dplyr::summarise(
  group_by(wave2, demo_agegroup, demo_region_recode, demo_gender),
  n = n()
)

w1_strata$strata <- apply(
  w1_strata[, c("demo_region_recode", "demo_gender", "demo_agegroup")],
  1, paste, collapse = "_"
)
w2_strata$strata <- apply(
  w2_strata[, c("demo_region_recode", "demo_gender", "demo_agegroup")],
  1, paste, collapse = "_"
)

# Cells of the grid that never occur in wave 1, split back into components.
missing_strata_w1 <- data.frame(
  missing_strata_w1 = setdiff(fb_total_strata$strata, w1_strata$strata)
) %>%
  separate(
    missing_strata_w1,
    into = c("missing_region", "missing_gender", "missing_agegroup"),
    sep = "_",
    remove = FALSE
  )
# 85 missing strata from wave 1, an additional 117 had fewer than 10 responses
# 70 strata filled with 10 or more obs

# Same for wave 2; only this list is written to disk.
missing_strata_w2 <- data.frame(
  missing_strata_w2 = setdiff(fb_total_strata$strata, w2_strata$strata)
) %>%
  separate(
    missing_strata_w2,
    into = c("missing_region", "missing_gender", "missing_agegroup"),
    sep = "_",
    remove = FALSE
  )
# 73 missing strata from wave 2, an additional 109 had fewer than 10 responses
# 90 strata filled with 10 or more obs
write_csv(missing_strata_w2, file = "indonesia data/indonesia_empty_adcells.csv")

# Keep only completed responses that took at least five minutes, then store
# the cleaned sample together with the table of dropped responses.
d <- d %>% dplyr::filter(complete == 1 & duration.minutes >= 5)

save(d, incompletes, file = "indonesia data/indonesia_facebook_cleaned.Rda")

# recode and gather census data ####
# Read the province x age x gender sheet, keeping rows 5-38 of the imported
# table.
census <- read_excel("indonesia data/province x age x gender.xlsx")[5:38, ]

# Column layout: one region column followed by a (male, female, total) triple
# for each of the 21 age columns (20 age bands + an overall total).
age_groups <- c(
  "00-04", "05-09", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39",
  "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79",
  "80-84", "85-89", "90-94", "95+", "Total Age Group"
)
gender_colnames <- rep(c("male", "female", "Total Gender"), times = length(age_groups))
age_colnames <- rep(age_groups, each = 3)
gender_age_colnames <- paste(gender_colnames, age_colnames, sep = "_")
region_gender_age_colnames <- c("region_census", gender_age_colnames)
names(census) <- region_gender_age_colnames

# Swap the census region label for the crosswalk's region code and move that
# code to the first column.
region_crosswalk <- read_csv("indonesia data/region_census_crosswalk.csv")

census <- census %>%
  left_join(region_crosswalk) %>%
  select(-region_census) %>%
  relocate(last_col())

# Long format: one row per region x gender x age band, dropping the "Total"
# margins and storing the counts as numbers.
census_gathered <- census %>%
  gather(key = "Variable", value = "Value", -region) %>%
  separate(Variable, into = c("gender", "age"), sep = "_") %>%
  filter(!str_detect(gender, "Total"), !str_detect(age, "Total")) %>%
  mutate(Value = as.numeric(Value))

write.csv(census_gathered, "indonesia data/indonesia_census_cleaned.csv")

# set up weights for the main analysis ####
## if starting from here:
load("indonesia data/indonesia_facebook_cleaned.Rda")

# Respondents without a degree recode cannot be raked on education.
d <- d %>% filter(!is.na(degreerecode))

## create marginal distributions from census ####
# Each margin below is a two-column table (category, Freq) where Freq is the
# census share of the category multiplied by the wave 2 sample size.
wave2 <- filter(d, wave == "wave2")
census <- read_csv("indonesia data/indonesia_census_cleaned.csv")
table(census$age)

# Drop the age bands below 15 and keep 40% of the 15-19 band, since our
# youngest respondents are only 18.
census <- census %>%
  filter(!(age %in% c("00-04", "05-09", "10-14"))) %>%
  mutate(Value = ifelse(age == "15-19", Value * .4, Value))
centotals <- sum(census$Value)

# Region margin; the column is named after the region code (facebook's cells)
# used in the sample data.
region <- census %>%
  group_by(demo_region_recode = region) %>%
  summarise(Freq = nrow(wave2) * sum(Value) / centotals)

# Age margin, with the five-year census bands pooled into the sample's groups.
age <- census %>%
  group_by(
    demo_agegroup = case_when(
      age %in% c("15-19", "20-24", "25-29") ~ "18-29",
      age %in% c("30-34", "35-39", "40-44", "45-49") ~ "30-49",
      age %in% c("50-54", "55-59") ~ "50-59",
      age %in% c("60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90-94", "95+") ~ "60+"
    )
  ) %>%
  summarise(Freq = nrow(wave2) * sum(Value) / centotals)

# Gender margin, title-cased to match the sample coding.
gender <- census %>%
  group_by(gender) %>%
  summarise(Freq = nrow(wave2) * sum(Value) / centotals) %>%
  mutate(gender = str_to_title(gender)) %>%
  rename(demo_gender = gender)

## educational degree data from census
degree.census <- read_csv("indonesia data/education.csv")
# National total, taken from the "TOTAL" row before that row is removed.
totalpop <- degree.census$Total[degree.census$Province == "TOTAL"]

# Clean the province labels (strip digits, dots and punctuation, title-case,
# trim), drop the Total column and the total row, spell out two abbreviated
# province names, then sum every degree column over provinces.
degree.census <- degree.census %>%
  mutate(
    Province = str_trim(str_to_title(
      gsub('[[:punct:]]+', '', gsub('[0-9.]', '', Province))
    ))
  ) %>%
  select(-Total) %>%
  filter(Province != "Total") %>%
  mutate(
    Province = case_when(
      Province == "Dki Jakarta" ~ "Daerah Khusus Ibu Kota Jakarta",
      Province == "Di Yogyakarta" ~ "Daerah Istimewa Yogyakarta",
      TRUE ~ Province
    )
  ) %>%
  summarise(across(!Province, sum))

# Transpose to one row per degree category, with the category name as a column.
degree.census <- data.frame(value = t(degree.census))
degree.census$demo_degree <- rownames(degree.census)

## this is the summary we'll use later in descriptive figures
# Pool the census categories into the three degreerecode levels and express
# each as a share of totalpop.
degree.census <- degree.census %>%
  group_by(
    term = case_when(
      demo_degree %in% c(
        "No school", "Didn't finish elementary", "SD/SDLB/MI/Package A",
        "SMP/SMPLB/MTs/PackageB"
      ) ~ "nosecondary",
      demo_degree == "SMA/SMALB/MA/SMK/MAK/Pakage C" ~ "secondary",
      demo_degree %in% c("DI/DII/DIII", "DIV/S1", "Profession", "S2/S3") ~ "Postsecondary"
    )
  ) %>%
  summarise(estimate = sum(value) / totalpop) %>%
  mutate(std.error = 0, sample = "Census", var = "Degree")

## create the summary we'll use for weights
# Same shares scaled to the wave 2 sample size.
degree <- degree.census %>%
  select(degreerecode = term, Freq = estimate) %>%
  mutate(Freq = nrow(wave2) * Freq)

save(gender, age, degree, region, degree.census, centotals,
     file = "indonesia data/weightingdata_indonesia.Rda")

## create weights for wave 1 facebook data ####
# Rake the wave 1 sample to the census margins for gender, age group, region
# and degree, trim the largest weights, and keep only the respondent id and
# the trimmed weight.
wave1 <- d %>% filter(wave == "wave1")
design1 <- svydesign(ids = ~0, data = wave1)

## re-create weighting data to match wave 1 rather than wave 2
# Each margin is the census share of a category times the wave 1 sample size.
# This overwrites the wave 2 versions of degree/region/age/gender held in
# memory (the wave 2 versions were saved to disk beforehand).
degree <- degree.census %>%
  select(term, estimate) %>%
  mutate(estimate = nrow(wave1) * estimate) %>%
  rename(degreerecode = term,
         Freq = estimate)
region <- census %>%
  group_by(region) %>%
  summarise(Freq = sum(Value)) %>%
  ungroup() %>%
  mutate(Freq = nrow(wave1) * Freq / centotals) %>%
  rename(demo_region_recode = region) ## name matches the region code (facebook's cells) in the sample data
age <- census %>%
  mutate(age = case_when(
    age %in% c("15-19", "20-24", "25-29") ~ "18-29",
    age %in% c("30-34", "35-39", "40-44", "45-49") ~ "30-49",
    age %in% c("50-54", "55-59") ~ "50-59",
    age %in% c("60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90-94", "95+") ~ "60+"
  )) %>%
  group_by(age) %>%
  summarise(Freq = sum(Value)) %>%
  ungroup() %>%
  mutate(Freq = nrow(wave1) * Freq / centotals) %>%
  rename(demo_agegroup = age)
gender <- census %>%
  group_by(gender) %>%
  summarise(Freq = sum(Value)) %>%
  ungroup() %>%
  mutate(Freq = nrow(wave1) * Freq / centotals,
         gender = str_to_title(gender)) %>%
  rename(demo_gender = gender)

# The population tables are passed by their full argument name
# (population.margins) rather than relying on partial matching of "population".
rake.wave1 <- rake(
  design = design1,
  sample.margins = list(~demo_gender, ~demo_agegroup, ~demo_region_recode, ~degreerecode),
  population.margins = list(gender, age, region, degree),
  control = list(maxit = 100)
)
quantile(weights(rake.wave1), probs = c(.05, .5, .95))

## trim weights at the top: cap at the 95th percentile of the raked weights
rake.wave1 <- trimWeights(rake.wave1, lower = -Inf,
                          upper = quantile(weights(rake.wave1), probs = c(.95)),
                          strict = TRUE)
wave1$weight_trimmed <- weights(rake.wave1)
wave1 %<>% select(ResponseId, weight_trimmed)


## create weights for wave 2 facebook data for main analysis ####
# Reload the saved wave 2 margins (gender, age, degree, region), replacing the
# wave 1 versions currently in memory, then rake and trim as for wave 1.
load("indonesia data/weightingdata_indonesia.Rda")
design <- svydesign(ids = ~0, data = wave2)

# population.margins is spelled out in full instead of relying on partial
# argument matching of "population".
rake1 <- rake(
  design = design,
  sample.margins = list(~demo_gender, ~demo_agegroup, ~demo_region_recode, ~degreerecode),
  population.margins = list(gender, age, region, degree),
  control = list(maxit = 100)
)
quantile(weights(rake1), probs = c(.05, .5, .95))

# Untrimmed weights (this column is dropped again by the select() below).
wave2$weight <- weights(rake1)

## trim weights at the top: cap at the 95th percentile, no lower bound
rake1 <- trimWeights(rake1, lower = -Inf, # quantile(weights(rake1),probs=c(.05)),
                     upper = quantile(weights(rake1), probs = c(.95)),
                     strict = TRUE)

wave2$weight_trimmed <- weights(rake1)

wave2 %<>% select(ResponseId, weight_trimmed)

# Stack the per-wave weights and attach them to the full sample by respondent.
# NOTE(review): this data frame shares its name with the weights() function;
# later calls such as weights(rake1) still work because R skips non-function
# objects when resolving a call, but a different name would be clearer.
weights <- bind_rows(wave1, wave2)

d %<>% left_join(weights, by = "ResponseId")

save(d, incompletes, file = "indonesia data/indonesia_facebook_cleaned_weighted.Rda")

# recode and subset Asianbarometer ####
asiabaro <- read_dta("indonesia data/W5_Indonesia_merged_core_20220905_released.dta")

# Pull the four political-action items, name them like the Facebook survey
# columns, and replace each numeric code with the answer wording below so the
# same Yes/No/DK recode can be applied later. Codes not listed are kept as
# their character form.
# across(everything(), ...) replaces the superseded mutate_all().
asiabaro_pol_actions <- asiabaro %>%
  select(
    pol_actions1 = V_75,
    pol_actions2 = V_76,
    pol_actions3 = V_78,
    pol_actions4 = V_79
  ) %>%
  mutate(across(everything(), ~ case_when(
    .x == 1 ~ "I have done this more than three times",
    .x == 2 ~ "I have done this two or three times",
    .x == 3 ~ "I have done this once",
    .x == 4 ~ "I have not done this, but I might do it if something important happens in the future",
    .x == 5 ~ "I have not done this and I would not do it regardless of the situation",
    .x == 7 ~ "[Do not read] Do not understand the question",
    .x == 8 ~ "[Do not read] Can’t choose",
    .x == 9 ~ "[Do not read] Decline to answer",
    TRUE ~ as.character(.x)
  )))
  


# Build the Asian Barometer demographics table: select the raw items, turn
# their numeric codes into text labels, rename age and income, and drop the
# raw coded columns that have been replaced by labelled versions.
asiabaro_demos <- asiabaro %>%
  select(Region, SE2, SE3A, SE4, SE5, SE6, V_56, V_57, SE14) %>%
  mutate(
    # Region is recoded in place; codes not listed keep their character form.
    Region = case_when(
      Region == -1 ~ "Missing",
      Region == 901 ~ "Sumatera",
      Region == 902 ~ "Java",
      Region == 903 ~ "Bali Nusa",
      Region == 904 ~ "Kalimantan",
      Region == 905 ~ "Sulawesi",
      Region == 906 ~ "Maluku Papua",
      TRUE ~ as.character(Region)
    ),
    # gender from SE2; unlisted codes keep their character form.
    gender = case_when(
      SE2 == -1 ~ "Missing",
      SE2 == 1 ~ "Male",
      SE2 == 2 ~ "Female",
      TRUE ~ as.character(SE2)
    ),
    # marital_status from SE4; unlisted codes keep their character form.
    marital_status = case_when(
      SE4 == 1 ~ "Not married",
      SE4 == 2 ~ "Married",
      SE4 == 3 ~ "Living together (like marriage) without marriage ties",
      SE4 == 4 ~ "Widow / widower",
      SE4 == 5 ~ "Separate the bed / get married but separate / stay not with a legitimate partner",
      SE4 == 6 ~ "Divorced",
      SE4 == 9 ~ "Refusing to answer",
      TRUE ~ as.character(SE4)
    ),
    # education from SE5; these labels are pattern-matched later in the file
    # when dem_edu is derived, so their wording matters.
    education = case_when(
      SE5 == 1 ~ "Never go to school",
      SE5 == 2 ~ "Not completing elementary school / equivalent",
      SE5 == 3 ~ "Elementary School / equivalent",
      SE5 == 4 ~ "Not graduating junior high / equivalent",
      SE5 == 5 ~ "Completed junior high / equivalent",
      SE5 == 6 ~ "Not graduating from high school / equivalent",
      SE5 == 7 ~ "High school graduated / equivalent",
      SE5 == 8 ~ "Not graduating from college / still a student",
      SE5 == 9 ~ "D3 / diploma graduation or bachelor's degree",
      SE5 == 10 ~ "Graduate or higher graduate",
      TRUE ~ as.character(SE5)
    ),
    # religion from SE6; unlisted codes keep their character form.
    religion = case_when(
      SE6 == 1 ~ "Islam",
      SE6 == 2 ~ "Catholic",
      SE6 == 3 ~ "Protestant",
      SE6 == 4 ~ "Hindu",
      SE6 == 5 ~ "Buddha",
      SE6 == 6 ~ "Confucianism",
      SE6 == 7 ~ "Others",
      TRUE ~ as.character(SE6)
    ) ,
    # Earlier party labelling (full English party names), kept for reference
    # but not run:
    # party = case_when(
    #   V_56 == 1 ~ "National Awakening Party",
    #   V_56 == 2 ~ "Great Indonesia Movement Party",
    #   V_56 == 3 ~ "PDI-P", # Indonesian Democratic Party of Struggle
    #   V_56 == 4 ~ "Golkar",
    #   V_56 == 5 ~ "Nasdem Party",
    #   V_56 == 6 ~ "Garuda Party",
    #   V_56 == 7 ~ "Berkarya Party",
    #   V_56 == 8 ~ "Prosperous Justice Party",
    #   V_56 == 9 ~ "Perindo Party",
    #   V_56 == 10 ~ "United Development Party",
    #   V_56 == 11 ~ "Indonesian Solidarity Party",
    #   V_56 == 12 ~ "National Mandate Party",
    #   V_56 == 13 ~ "People's Conscience Party",
    #   V_56 == 14 ~ "Democratic Party",
    #   V_56 == 19 ~ "Crescent Star Party",
    #   V_56 == 20 ~ "Indonesian Justice and Unity Party",
    #   V_56 == 88 ~ "Don’t feel close to any political party",
    #   V_56 == 99 ~ "DK/RA",
    #   TRUE ~ as.character(V_56)
    # ) # for wave1 data matching
    # party from V_56, using short party names. There is no fallback branch
    # (it is commented out), so any code not listed becomes NA.
    party = case_when(
      V_56 == 1  ~ "pkb",
      V_56 == 2  ~ "Gerindra",
      V_56 == 3  ~ "PDI-P",
      V_56 == 4  ~ "Golkar",
      V_56 == 5  ~ "NasDem",
      V_56 == 6 ~ "Partai Garuda",
      V_56 == 7  ~ "Berkarya",
      V_56 == 8  ~ "pks",
      V_56 == 9 ~ "Perindo",
      V_56 == 10 ~ "ppp",
      V_56 == 11 ~ "psi",
      V_56 == 12 ~ "pan",
      V_56 == 13 ~ "Hanura",
      V_56 == 14 ~ "Demokrat",
      V_56 == 19 ~ "pbb",
      V_56 == 20 ~ "pkpi",
      V_56 == 88 ~ "Don’t feel close to any political party",
      V_56 == 99 ~ "DK/RA",
      #TRUE        ~ as.character(V_56)  # For values not listed
    ),
  # party_close from V_57: codes 1-3 are closeness levels, 8 is "DK", and
  # codes 7, 9 and 999 are explicitly set to NA (as is any unlisted code).
  party_close = case_when(
    V_57 == 1  ~ "Very close",
    V_57 == 2  ~ "Somewhat close",
    V_57 == 3  ~ "Just a little close",
    V_57 == 7  ~ NA,
    V_57 == 8  ~ "DK",
    V_57 == 9 ~ NA,
    V_57 == 999  ~ NA),
  ) %>%
  # SE3A and SE14 are kept as-is under readable names.
  rename(
    age = SE3A,
    income = SE14
  ) %>%
  # Drop the raw coded items now represented by the labelled columns above.
  select(-c(SE2, SE4, SE5, SE6, V_56, V_57))

# Add the age groups used for the other samples and a four-level education
# variable (None / Primary / Secondary / University) derived from the
# education labels. Each pattern matches exactly one of the education labels,
# so the branches are listed by attainment level; an incomplete level is
# counted at the level below.
asiabaro_demos <- asiabaro_demos %>%
  mutate(
    agegroup = case_when(
      age >= 18 & age < 30 ~ "18-29",
      age >= 30 & age < 50 ~ "30-49",
      age >= 50 & age < 60 ~ "50-59",
      age >= 60 ~ "60+"
    ),
    dem_edu = case_when(
      grepl("Never", education) ~ "None",
      grepl("Not completing elementary", education) ~ "None",
      grepl("Elementary", education) ~ "Primary",
      grepl("Not graduating junior", education) ~ "Primary",
      grepl("Completed junior", education) ~ "Primary",
      grepl("Not graduating from high", education) ~ "Primary",
      grepl("High school graduated", education) ~ "Secondary",
      grepl("Not graduating from college", education) ~ "Secondary",
      grepl("diploma graduation", education) ~ "University",
      grepl("Graduate or higher", education) ~ "University"
    )
  )

# Name the four political-action items and collapse them to Yes / No / DK with
# the same matching rules used for the Facebook sample: "have done" wording is
# "Yes", "have not done" wording is "No", anything else is "DK".
asiabaro_pol_actions <- asiabaro_pol_actions %>%
  rename(
    Petition = pol_actions1,
    OnlinePost = pol_actions2,
    Meeting = pol_actions3,
    Protested = pol_actions4
  ) %>%
  mutate(
    Petition = case_when(
      grepl("I have done this", Petition) ~ "Yes",
      grepl("I have not done this", Petition) ~ "No",
      .default = "DK"
    ),
    across(
      c(OnlinePost, Meeting, Protested),
      ~ case_when(
        grepl("I have done", .x) ~ "Yes",
        grepl("I have not done", .x) ~ "No",
        .default = "DK"
      )
    )
  )
table(asiabaro_pol_actions$Protested)
  
# Voting items: V_33 becomes votelastelection (Yes / No / DK, code 9 -> NA)
# and V_81 becomes votinghabit (every / most / some / hardlyever / DK, codes
# 5, 7 and 9 -> NA). Only the two recoded columns are kept.
asiabaro_voting <- asiabaro %>%
  select(V_33, V_81) %>%
  mutate(
    votelastelection = case_when(
      V_33 == 1 ~ "Yes",
      V_33 == 2 ~ "No",
      V_33 == 8 ~ "DK",
      V_33 == 9 ~ NA
    ),
    votinghabit = case_when(
      V_81 == 1 ~ "every",
      V_81 == 2 ~ "most",
      V_81 == 3 ~ "some",
      V_81 == 4 ~ "hardlyever",
      V_81 == 5 ~ NA,
      V_81 == 7 ~ NA,
      V_81 == 8 ~ "DK",
      V_81 == 9 ~ NA
    ),
    .keep = "none"
  )

# Put the three recoded tables side by side (rows are in the same order, as
# all come from asiabaro) and write the result.
asiabaro_vars <- bind_cols(asiabaro_pol_actions, asiabaro_demos, asiabaro_voting)

write.csv(asiabaro_vars, "indonesia data/asianbarometer_cleaned.csv")


# recode and subset Dynata ####
dynata <- read_csv("indonesia data/mildenbergerdynata_2021.csv")

# Keep respondents who consented and gave the expected attention-check answer,
# then select the analysis variables, renaming them on the way to the names
# used for the Facebook sample.
dynata_id <- dynata %>%
  filter(
    consent == "I agree to participate",
    attentioncheck == "Somewhat agree"
  ) %>%
  select(
    contains("gw"),
    gw_threat = lloyds,
    demo_age = age,
    demo_gender = gender,
    demo_educ = education,
    INDONESIAparty,
    region,
    pol_leftright = ideology,
    dem_marital = relationship,
    work_employed = employment_gallup,
    work_hours = employ_hrs,
    work_retired = gallup_retired,
    work_income_1 = income_1,
    dem_kids = kids,
    asiabarometer_voting
  )

# Derive the weighting variables for the Dynata sample:
#   demo_agegroup  four age bands starting at 18
#   votinghabit    the four Indonesian-language answers mapped to
#                  hardlyever / some / most / every (anything else -> NA)
#   degreerecode   three-level attainment from demo_educ; respondents with a
#                  missing demo_educ are dropped first
dynata_id <- dynata_id %>%
  mutate(
    demo_agegroup = case_when(
      demo_age >= 18 & demo_age < 30 ~ "18-29",
      demo_age > 29 & demo_age < 50 ~ "30-49",
      demo_age > 49 & demo_age < 60 ~ "50-59",
      demo_age > 59 ~ "60+"
    )
  ) %>%
  rename(votinghabit = asiabarometer_voting) %>%
  mutate(
    votinghabit = case_when(
      votinghabit == "Hampir tidak pernah memilih" ~ "hardlyever",
      votinghabit == "Memberikan suara di beberapa pemilihan" ~ "some",
      votinghabit == "Memberikan suara di sebagian besar pemilihan" ~ "most",
      votinghabit == "Memberikan suara di setiap pemilihan" ~ "every"
    )
  ) %>%
  filter(!is.na(demo_educ)) %>% ## we lose about 50 respondents here
  mutate(
    degreerecode = case_when(
      demo_educ %in% c("No formal education", "Primary education") ~ "nosecondary",
      demo_educ == "Secondary education (high school)" ~ "secondary",
      demo_educ == "Tertiary education (college or university degree)" ~ "Postsecondary"
    )
  )
# Check how the Dynata region labels line up with the Facebook sample's.
table(dynata_id$region)
setdiff(unique(dynata_id$region), unique(d$demo_region))
setdiff(unique(d$demo_region), unique(dynata_id$region))

# Lookup from region label to region code, taken from the Facebook sample.
regionrecodes <- distinct(d, demo_region, demo_region_recode)

# Align two region labels with the Facebook spelling, drop respondents missing
# any weighting variable, and attach the region code.
dynata_id <- dynata_id %>%
  mutate(
    region = case_when(
      region == "Jakarta Raya" ~ "Daerah Khusus Ibu Kota Jakarta",
      region == "Yogyakarta" ~ "Daerah Istimewa Yogyakarta",
      .default = region
    )
  ) %>%
  filter(
    !is.na(region),
    !is.na(demo_agegroup),
    !is.na(demo_age),
    !is.na(demo_gender),
    !is.na(degreerecode)
  ) %>% ## lost 65 respondents here
  left_join(regionrecodes, by = c("region" = "demo_region"))

### create weights for dynata
# Rebuild the census margins, this time scaled to the Dynata sample size.
census <- read_csv("indonesia data/indonesia_census_cleaned.csv")
table(census$age)

# Drop the age bands below 15 and keep 40% of the 15-19 band, since our
# youngest respondents are only 18.
census <- census %>%
  filter(!(age %in% c("00-04", "05-09", "10-14"))) %>%
  mutate(Value = ifelse(age == "15-19", Value * .4, Value))
centotals <- sum(census$Value)

# Region margin. Two regions that are absent from the dynata sample are folded
# into a neighbouring region before summing; the column is named after the
# region code (facebook's cells) used in the sample data.
region <- census %>%
  group_by(
    demo_region_recode = case_when(
      region == "eastnusatenggara" ~ "westnusatenggara",
      region == "maluku" ~ "northmaluku",
      .default = region
    )
  ) %>%
  summarise(Freq = nrow(dynata_id) * sum(Value) / centotals)


# Age margin, with the five-year census bands pooled into the sample's groups.
age <- census %>%
  group_by(
    demo_agegroup = case_when(
      age %in% c("15-19", "20-24", "25-29") ~ "18-29",
      age %in% c("30-34", "35-39", "40-44", "45-49") ~ "30-49",
      age %in% c("50-54", "55-59") ~ "50-59",
      age %in% c("60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90-94", "95+") ~ "60+"
    )
  ) %>%
  summarise(Freq = nrow(dynata_id) * sum(Value) / centotals)

# Gender margin, title-cased to match the sample coding.
gender <- census %>%
  group_by(gender) %>%
  summarise(Freq = nrow(dynata_id) * sum(Value) / centotals) %>%
  mutate(gender = str_to_title(gender)) %>%
  rename(demo_gender = gender)

# Rescale the degree margin currently in memory so that its cells sum to the
# Dynata sample size.
# NOTE(review): this relies on `degree` already existing in the session (it is
# created/loaded in the Facebook weighting sections above).
cenpop <- sum(degree$Freq)
degree %<>%
  mutate(Freq = Freq * nrow(dynata_id) / cenpop)

# Rake the Dynata sample to the gender, age, region and degree margins.
# No `partial` option is passed here; the regions absent from the sample are
# instead handled by collapsing them when the region margin is built.
# population.margins is spelled out in full instead of relying on partial
# argument matching of "population".
design <- svydesign(ids = ~0, data = dynata_id)
rake1 <- rake(
  design = design,
  sample.margins = list(~demo_gender, ~demo_agegroup, ~demo_region_recode, ~degreerecode),
  population.margins = list(gender, age, region, degree),
  control = list(maxit = 100)
)
quantile(weights(rake1), probs = c(.05, .5, .95))

# Untrimmed raked weights.
dynata_id$weight <- weights(rake1)

## trim weights: cap at the 95th percentile of the raked weights, no lower bound
rake1 <- trimWeights(rake1, lower = -Inf,
                     upper = quantile(weights(rake1), probs = c(.95)),
                     strict = TRUE)

dynata_id$weight_trimmed <- weights(rake1)

write_csv(dynata_id, "indonesia data/dynata_cleaned.csv")

